#
# dest_path is the directory where you have placed the csv file
#
#
# number of bootstrap replicates
#
n_bootstrap <- 5000
# NOTE(review): hard-coded, user-specific absolute path -- anyone else running
# this script must edit dest_path to point at their copy of the csv file.
dest_path<-"/Users/haroldpollack/documents/SSA_456_2021/"
dest_filename<-"tutoring_RCT.csv"
# dest_path already ends in "/", so plain concatenation gives a valid path
complete_file_name<-paste0(dest_path,dest_filename)
v_data_frame<-read.csv(complete_file_name)
# Inspect the data: 1500 observations, 8 variables (echoed output below)
str(v_data_frame)
## 'data.frame': 1500 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ woman : int 0 0 0 1 0 0 0 0 0 0 ...
## $ urban : int 1 1 0 1 1 1 1 1 0 0 ...
## $ Z : int 1 1 0 1 0 0 0 1 1 0 ...
## $ dose : num 6.83 4.838 0.645 6.171 5.788 ...
## $ score : num 182.4 110.9 79.1 189.7 157.6 ...
## $ score_error: num 14.075 -37.451 -27.34 16.024 -0.279 ...
## $ dose_error : num 1.33 -0.662 -1.355 2.171 2.288 ...
# Randomization checks: baseline covariates should not differ between the
# treatment (Z = 1) and control (Z = 0) arms.
paste("Check baseline balance of urbanicity.")
## [1] "Check baseline balance of urbanicity."
# Large p-value (0.86) -- urbanicity is balanced across arms, as expected
t.test(urban ~ Z, data = v_data_frame)
##
## Welch Two Sample t-test
##
## data: urban by Z
## t = -0.18093, df = 1497.9, p-value = 0.8564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05229245 0.04346020
## sample estimates:
## mean in group 0 mean in group 1
## 0.6618037 0.6662198
paste("Check baseline balance of gender.")
## [1] "Check baseline balance of gender."
# Large p-value (0.72) -- gender is balanced across arms as well
t.test(woman ~ Z, data = v_data_frame)
##
## Welch Two Sample t-test
##
## data: woman by Z
## t = -0.36008, df = 1497.5, p-value = 0.7188
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05483663 0.03782637
## sample estimates:
## mean in group 0 mean in group 1
## 0.2931034 0.3016086
#str(v_data_frame)
#describe(v_data_frame)
paste("Regression analysis of test scores as a function of gender and urbanicity alone.")
## [1] "Regression analysis of test scores as a function of gender and urbanicity alone."
# OLS of score on the two baseline covariates only (no dose, no Z)
model_score_nodose<-lm(score ~ woman+urban, data = v_data_frame)
summary(model_score_nodose)
##
## Call:
## lm(formula = score ~ woman + urban, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.840 -20.510 -0.117 20.943 113.618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.829 1.423 91.247 <2e-16 ***
## woman -1.090 1.701 -0.641 0.522
## urban 14.684 1.646 8.920 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.1 on 1497 degrees of freedom
## Multiple R-squared: 0.05056, Adjusted R-squared: 0.0493
## F-statistic: 39.86 on 2 and 1497 DF, p-value: < 2.2e-16
paste("Regression analysis of tutoring dose as a function of gender and urbanicity alone.")
## [1] "Regression analysis of tutoring dose as a function of gender and urbanicity alone."
paste("Wow--women get much less tutoring. Urban residents get much more. This could be an important issue.")
## [1] "Wow--women get much less tutoring. Urban residents get much more. This could be an important issue."
# Who takes up the tutoring? Dose is strongly related to both covariates,
# so realized dose is self-selected even though Z was randomized.
model_dose_noZ<-lm(dose ~ woman+urban, data = v_data_frame)
summary(model_dose_noZ)
##
## Call:
## lm(formula = dose ~ woman + urban, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5072 -1.0886 0.0097 1.0266 4.2102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.99976 0.06707 44.73 <2e-16 ***
## woman -1.40041 0.08019 -17.46 <2e-16 ***
## urban 1.50741 0.07760 19.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.419 on 1497 degrees of freedom
## Multiple R-squared: 0.3074, Adjusted R-squared: 0.3064
## F-statistic: 332.2 on 2 and 1497 DF, p-value: < 2.2e-16
paste("Now add Z.")
## [1] "Now add Z."
paste("Regression analysis of tutoring dose as a function of treatment group Z alone.")
## [1] "Regression analysis of tutoring dose as a function of treatment group Z alone."
# Bivariate first-stage check: assignment Z moves dose by about 2 units
model_dose_Z_alone<-lm(dose ~ Z, data = v_data_frame)
summary(model_dose_Z_alone)
##
## Call:
## lm(formula = dose ~ Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3211 -0.9247 0.0408 0.9491 3.8225
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.60337 0.05061 51.44 <2e-16 ***
## Z 1.97236 0.07177 27.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.39 on 1498 degrees of freedom
## Multiple R-squared: 0.3352, Adjusted R-squared: 0.3348
## F-statistic: 755.3 on 1 and 1498 DF, p-value: < 2.2e-16
paste("Here's a regression analysis of tutoring dose as a function of gender, urbanicity, and treatment group Z.")
## [1] "Here's a regression analysis of tutoring dose as a function of gender, urbanicity, and treatment group Z."
paste("We want a strong coefficient on Z so we can use it as an instrumental variable for dosage.")
## [1] "We want a strong coefficient on Z so we can use it as an instrumental variable for dosage."
paste("What happens to the other coefficients?")
## [1] "What happens to the other coefficients?"
# Full first stage: Z remains a strong predictor of dose (t = 37.65) after
# conditioning on the covariates, so Z is a usable instrument.
model_dose_Z<-lm(dose ~ woman+urban+Z, data = v_data_frame)
summary(model_dose_Z)
##
## Call:
## lm(formula = dose ~ woman + urban + Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5263 -0.6695 -0.0325 0.6511 3.2039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.02818 0.05456 37.17 <2e-16 ***
## woman -1.42028 0.05748 -24.71 <2e-16 ***
## urban 1.49814 0.05562 26.93 <2e-16 ***
## Z 1.97782 0.05253 37.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.017 on 1496 degrees of freedom
## Multiple R-squared: 0.6444, Adjusted R-squared: 0.6437
## F-statistic: 903.6 on 3 and 1496 DF, p-value: < 2.2e-16
#paste("Regression analysis of test scores as a function of gender and urbanicity, and tutoring dose.")
#paste("You might think this makes things better because dose is an important confounder. But controlling for it creates other issues. We don't know how people self-select into tutoring. So the other coefficients could still be biased. For example, the highly motivated men may be the ones with high takeup. This can upward-bias the dosage coefficient and downward-bias the male coefficient.")
#model_score<-lm(score ~ dose+woman+urban, data = v_data_frame)
#summary(model_score)
paste("Regression analysis of test scores as a function of gender, urbanicity, plus treatment group Z.")
## [1] "Regression analysis of test scores as a function of gender, urbanicity, plus treatment group Z."
paste("Note that the only directly interpretable coefficient is the coefficient on Z. Gender and urbanicity will be potentially biased if they are correlated with dose.")
## [1] "Note that the only directly interpretable coefficient is the coefficient on Z. Gender and urbanicity will be potentially biased if they are correlated with dose."
# Intent-to-treat (ITT) regression: Z coefficient (~20.6) is the ITT effect
model_score_Z_and_confounders<-lm(score ~ woman+urban+Z, data = v_data_frame)
summary(model_score_Z_and_confounders)
##
## Call:
## lm(formula = score ~ woman + urban + Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.441 -19.808 -0.876 19.488 105.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.734 1.518 78.868 <2e-16 ***
## woman -1.296 1.599 -0.810 0.418
## urban 14.588 1.548 9.426 <2e-16 ***
## Z 20.551 1.462 14.062 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.3 on 1496 degrees of freedom
## Multiple R-squared: 0.1614, Adjusted R-squared: 0.1597
## F-statistic: 95.98 on 3 and 1496 DF, p-value: < 2.2e-16
paste("Regression analysis of test scores as a function of treatment group Z alone. And look how similar the Z-coefficients are. Good randomization.")
## [1] "Regression analysis of test scores as a function of treatment group Z alone. And look how similar the Z-coefficients are. Good randomization."
# Unadjusted ITT: Z coefficient (~20.6) matches the covariate-adjusted one,
# which is what successful randomization should produce.
model_score_Z_only<-lm(score ~ Z, data = v_data_frame)
summary(model_score_Z_only)
##
## Call:
## lm(formula = score ~ Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -87.892 -20.188 -0.425 18.941 96.417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.008 1.060 121.68 <2e-16 ***
## Z 20.605 1.503 13.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.11 on 1498 degrees of freedom
## Multiple R-squared: 0.1114, Adjusted R-squared: 0.1108
## F-statistic: 187.8 on 1 and 1498 DF, p-value: < 2.2e-16
paste("Now let's do the IV analysis of test scores as a function of gender, urbanicity, and tutoring dose. Ignore the diagnostics for now")
## [1] "Now let's do the IV analysis of test scores as a function of gender, urbanicity, and tutoring dose. Ignore the diagnostics for now"
paste("Look what happens to the coefficients on women and urbanicity--They flip.")
## [1] "Look what happens to the coefficients on women and urbanicity--They flip."
# IV regression: dose instrumented by Z; woman and urban instrument themselves.
# NOTE(review): ivreg() comes from the AER (or ivreg) package -- presumably
# library(AER) is loaded earlier in the full document; confirm.
ivreg_rct = ivreg(score~dose+woman+urban | woman+urban+Z,data=v_data_frame)
summary(ivreg_rct,diagnostics=TRUE)
##
## Call:
## ivreg(formula = score ~ dose + woman + urban | woman + urban +
## Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.9059 -15.1364 0.4962 15.2117 90.6720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.6589 2.0261 48.694 <2e-16 ***
## dose 10.3909 0.5783 17.968 <2e-16 ***
## woman 13.4620 1.4907 9.031 <2e-16 ***
## urban -0.9796 1.4922 -0.656 0.512
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 1496 1417.8 <2e-16 ***
## Wu-Hausman 1 1495 159.9 <2e-16 ***
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.15 on 1496 degrees of freedom
## Multiple R-Squared: 0.4864, Adjusted R-squared: 0.4854
## Wald test: 156.7 on 3 and 1496 DF, p-value: < 2.2e-16
#ivreg_rct$coefficients
# Save the instrumented dose coefficient for comparison with the Wald
# estimator at the end of the script
dose_coef<- ivreg_rct$coefficients[2]
#dose_coef
#summary(dose_coef)
#mean(dose_coef)
# Two-stage least squares ----
#
# 2sls
#
paste("Now let's do conventional two-stage least squares and see how it compares to ivreg.")
## [1] "Now let's do conventional two-stage least squares and see how it compares to ivreg."
paste("First stage equation using Z to instrument for dose")
## [1] "First stage equation using Z to instrument for dose"
# 2SLS stage 1: regress the endogenous dose on exogenous covariates plus Z
ols_first <- lm(dose ~ woman+urban+Z, data = v_data_frame)
summary(ols_first)
##
## Call:
## lm(formula = dose ~ woman + urban + Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5263 -0.6695 -0.0325 0.6511 3.2039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.02818 0.05456 37.17 <2e-16 ***
## woman -1.42028 0.05748 -24.71 <2e-16 ***
## urban 1.49814 0.05562 26.93 <2e-16 ***
## Z 1.97782 0.05253 37.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.017 on 1496 degrees of freedom
## Multiple R-squared: 0.6444, Adjusted R-squared: 0.6437
## F-statistic: 903.6 on 3 and 1496 DF, p-value: < 2.2e-16
paste("Second stage equation using predicted dose from the first stage")
## [1] "Second stage equation using predicted dose from the first stage"
ols_second <- lm(score ~ dose_hat+woman+urban, data = v_data_frame)
summary(ols_second)
##
## Call:
## lm(formula = score ~ dose_hat + woman + urban, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.441 -19.808 -0.876 19.488 105.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.6589 2.5890 38.107 < 2e-16 ***
## dose_hat 10.3909 0.7390 14.062 < 2e-16 ***
## woman 13.4620 1.9049 7.067 2.42e-12 ***
## urban -0.9796 1.9068 -0.514 0.608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.3 on 1496 degrees of freedom
## Multiple R-squared: 0.1614, Adjusted R-squared: 0.1597
## F-statistic: 95.98 on 3 and 1496 DF, p-value: < 2.2e-16
paste("Note that the coefficients exactly match what we got in ivreg. Standard errors a little different.")
## [1] "Note that the coefficients exactly match what we got in ivreg. Standard errors a little different."
paste("Let's run the analogous regression uninstrumented relationships.")
## [1] "Let's run the analogous regression uninstrumented relationships."
paste("Regression analysis of test scores as a function of gender and urbanicity, and uninstrumented tutoring dose.")
## [1] "Regression analysis of test scores as a function of gender and urbanicity, and uninstrumented tutoring dose."
paste("Everything will be effed-up by selection bias. ")
## [1] "Everything will be effed-up by selection bias. "
# Fixed typos in the narration strings below ("thnk" -> "think",
# "sinc" -> "since"); the regression code itself is unchanged.
paste("You might think this makes things better because dose is an important confounder. Nope. Everything is effed-up by selection bias. We think we are helping by including dose in the model. But we don't know how people self-select into tutoring. So the other coefficients are biased.")
## [1] "You might think this makes things better because dose is an important confounder. Nope. Everything is effed-up by selection bias. We think we are helping by including dose in the model. But we don't know how people self-select into tutoring. So the other coefficients are biased."
paste("Dose coefficient is way upward biased.")
## [1] "Dose coefficient is way upward biased."
paste("This leads the woman coefficient to be way upward biased to fit the women's data, since most women weren't tutored.")
## [1] "This leads the woman coefficient to be way upward biased to fit the women's data, since most women weren't tutored."
paste("Urban coefficient is way downward biased to fit the data there.")
## [1] "Urban coefficient is way downward biased to fit the data there."
# Naive OLS using observed (self-selected) dose: compare its dose coefficient
# (15.1) to the IV estimate (10.4) to see the selection bias.
ols_unstrumented <- lm(score ~ dose+woman+urban, data = v_data_frame)
summary(ols_unstrumented)
##
## Call:
## lm(formula = score ~ dose + woman + urban, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.167 -14.059 0.428 14.774 80.192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.4226 1.5242 55.39 < 2e-16 ***
## dose 15.1367 0.3843 39.39 < 2e-16 ***
## woman 20.1081 1.3080 15.37 < 2e-16 ***
## urban -8.1334 1.2910 -6.30 3.9e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.1 on 1496 degrees of freedom
## Multiple R-squared: 0.5339, Adjusted R-squared: 0.533
## F-statistic: 571.3 on 3 and 1496 DF, p-value: < 2.2e-16
# Fitted values from the (biased) uninstrumented model, used in plots below
v_data_frame$score_hat <- fitted(ols_unstrumented)
#ggplot(v_data_frame,aes(dose,score_hat)+geom_point())
#
# Note to self: Go back and include the score_error term
#
# Scores (dark red), raw score errors (yellow), and fitted line (blue) vs dose
# NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4 in
# favor of `linewidth` -- still works, but emits a warning on newer versions.
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_point(aes(y = score_error), color = "yellow") +
geom_line(aes(y = score_hat), color="steelblue",size=2)
# Same plot faceted by urbanicity (rows) and gender (columns)
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_point(aes(y = score_error), color = "yellow") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(urban ~woman)
# Labelled factor versions of the 0/1 indicators, for readable plot facets
v_data_frame$F_urban <- factor(v_data_frame$urban,
levels = c(0,1),
labels = c("non-urban", "urban"))
v_data_frame$F_woman <- factor(v_data_frame$woman,
levels = c(0,1),
labels = c("Other gender", "Woman"))
v_data_frame$F_Z <- factor(v_data_frame$Z,
levels = c(0,1),
labels = c("Control group", "Treatment group"))
# Four-level combined factor coded as urban + 2*woman (0..3)
v_data_frame$F_urban_woman<- factor(v_data_frame$urban+2*v_data_frame$woman,
levels = c(0,1,2,3),
labels = c("Rural male", "urban male","rural woman","urban woman"))
# Regress the simulated score error on dose and the combined group factor;
# "Rural male" is the omitted reference category.
ols_score_error <- lm(score_error ~ dose+F_urban_woman, data = v_data_frame)
summary(ols_score_error)
##
## Call:
## lm(formula = score_error ~ dose + F_urban_woman, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.522 -13.926 0.367 14.701 80.255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.02578 1.58425 -9.484 < 2e-16 ***
## dose 5.04934 0.38380 13.156 < 2e-16 ***
## F_urban_womanurban male -8.41777 1.49259 -5.640 2.03e-08 ***
## F_urban_womanrural woman 7.82581 2.14396 3.650 0.000271 ***
## F_urban_womanurban woman 0.03495 1.63625 0.021 0.982959
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.06 on 1495 degrees of freedom
## Multiple R-squared: 0.1045, Adjusted R-squared: 0.1021
## F-statistic: 43.59 on 4 and 1495 DF, p-value: < 2.2e-16
v_data_frame$score_error_hat <- fitted(ols_score_error)
# Group means of score by the four urban/gender cells (intercept = Rural male)
ols_group_score <- lm(score ~ F_urban_woman, data = v_data_frame)
summary(ols_group_score)
##
## Call:
## lm(formula = score ~ F_urban_woman, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.365 -20.564 -0.145 20.946 112.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.430 1.580 81.901 < 2e-16 ***
## F_urban_womanurban male 15.293 1.952 7.835 8.82e-15 ***
## F_urban_womanrural woman 0.337 2.988 0.113 0.91
## F_urban_womanurban woman 13.518 2.339 5.780 9.07e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.11 on 1496 degrees of freedom
## Multiple R-squared: 0.05078, Adjusted R-squared: 0.04887
## F-statistic: 26.68 on 3 and 1496 DF, p-value: < 2.2e-16
v_data_frame$group_score_hat <- fitted(ols_group_score)
# NOTE(review): this refits exactly the same model as the earlier
# ols_score_error (identical formula and data), so the fit and summary are
# duplicates -- harmless but redundant; the earlier object could be reused.
ols_score_error <- lm(score_error ~ dose+F_urban_woman, data = v_data_frame)
summary(ols_score_error)
##
## Call:
## lm(formula = score_error ~ dose + F_urban_woman, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.522 -13.926 0.367 14.701 80.255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.02578 1.58425 -9.484 < 2e-16 ***
## dose 5.04934 0.38380 13.156 < 2e-16 ***
## F_urban_womanurban male -8.41777 1.49259 -5.640 2.03e-08 ***
## F_urban_womanrural woman 7.82581 2.14396 3.650 0.000271 ***
## F_urban_womanurban woman 0.03495 1.63625 0.021 0.982959
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.06 on 1495 degrees of freedom
## Multiple R-squared: 0.1045, Adjusted R-squared: 0.1021
## F-statistic: 43.59 on 4 and 1495 DF, p-value: < 2.2e-16
# Fitted residual shifted up by the group mean score, for overlay plots below
v_data_frame$meaned_score_error_hat <- fitted(ols_score_error)+v_data_frame$group_score_hat
#
# interaction
#
# Hand-built interaction terms (dose x Z, and three-way with urban/woman)
v_data_frame$dose_Z<-v_data_frame$dose*v_data_frame$Z
v_data_frame$dose_Z_urban<-v_data_frame$dose*v_data_frame$Z*v_data_frame$urban
v_data_frame$dose_Z_woman<-v_data_frame$dose*v_data_frame$Z*v_data_frame$woman
# Does the dose effect differ by assignment? dose_Z is the dose x Z interaction
ols_interaction <- lm(score ~ dose+urban+woman+dose_Z, data = v_data_frame)
summary(ols_interaction)
##
## Call:
## lm(formula = score ~ dose + urban + woman + dose_Z, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.656 -13.217 0.174 13.503 78.123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.1264 1.6705 44.97 <2e-16 ***
## dose 20.8850 0.6213 33.62 <2e-16 ***
## urban -13.3313 1.3179 -10.12 <2e-16 ***
## woman 25.3264 1.3339 18.99 <2e-16 ***
## dose_Z -4.1340 0.3598 -11.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.23 on 1495 degrees of freedom
## Multiple R-squared: 0.5718, Adjusted R-squared: 0.5706
## F-statistic: 499 on 4 and 1495 DF, p-value: < 2.2e-16
# BUG FIX: the original multiplied by the factor F_urban_woman directly, which
# R rejects ('*' not meaningful for factors) -- it warned and filled the
# column with NA. Convert the factor to its underlying 0-3 coding
# (urban + 2*woman, the scheme used to build F_urban_woman) before multiplying.
# NOTE(review): dose_factors is not used later in this script -- confirm the
# intended coding if this column is picked up downstream.
v_data_frame$dose_factors <- v_data_frame$dose * v_data_frame$Z *
  (as.numeric(v_data_frame$F_urban_woman) - 1)
# Full interaction model: dose x Z plus three-way terms with woman and urban.
# The three-way terms are only marginally significant (p ~ .08-.09).
ols_interaction_full <- lm(score ~ dose+urban+woman+dose_Z+dose_Z_woman+dose_Z_urban, data = v_data_frame)
summary(ols_interaction_full)
##
## Call:
## lm(formula = score ~ dose + urban + woman + dose_Z + dose_Z_woman +
## dose_Z_urban, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.678 -13.526 0.018 13.544 80.829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.2011 1.8148 41.437 <2e-16 ***
## dose 21.2427 0.6387 33.258 <2e-16 ***
## urban -15.4597 1.7263 -8.955 <2e-16 ***
## woman 27.4290 1.7766 15.439 <2e-16 ***
## dose_Z -4.8644 0.5845 -8.322 <2e-16 ***
## dose_Z_woman -0.9098 0.5404 -1.684 0.0925 .
## dose_Z_urban 0.9346 0.5265 1.775 0.0761 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.21 on 1493 degrees of freedom
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5717
## F-statistic: 334.4 on 6 and 1493 DF, p-value: < 2.2e-16
# Fitted scores from the full interaction model, used in plots below
v_data_frame$score_interaction_hat <- fitted(ols_interaction_full)
# Simplest possible model: score on dose alone (no covariates, no instrument)
ols_score_dose_only <- lm(score ~ dose, data = v_data_frame)
summary(ols_score_dose_only)
##
## Call:
## lm(formula = score ~ dose, data = v_data_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.134 -15.801 -0.196 15.360 100.366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.3441 1.3688 69.66 <2e-16 ***
## dose 12.2510 0.3449 35.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.75 on 1498 degrees of freedom
## Multiple R-squared: 0.4572, Adjusted R-squared: 0.4568
## F-statistic: 1262 on 1 and 1498 DF, p-value: < 2.2e-16
v_data_frame$score_dose_only_hat <- fitted(ols_score_dose_only)
# Pooled scatter of score vs dose with the dose-only fitted line
p_dose_only <-ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_dose_only_hat), color="steelblue", linetype="twodash",size=2)
# Same plot stratified by urbanicity (rows) and gender (columns)
p_dose_only_facets<- ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_dose_only_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_urban ~F_woman)
p_dose_only+ggtitle("Regressing score on dose")
p_dose_only_facets+ggtitle("Regressing score on dose--stratified by gender and urbanicity")
#
# Plots
#
# All plots below show score (and sometimes fitted values) against dose,
# faceted different ways using the labelled factors created earlier.
# score vs dose with the uninstrumented fitted line, by urbanicity x gender
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_urban ~F_woman)
# ...by urbanicity x treatment assignment
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_urban ~F_Z)
# ...by gender x treatment assignment
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_woman ~F_Z)
# Add the raw score errors (yellow), by combined group x assignment
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_point(aes(y = score_error), color = "yellow") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_urban_woman ~F_Z)
# Add the group-mean-shifted fitted errors (red) as well
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_point(aes(y = score_error), color = "yellow") +
geom_point(aes(y = meaned_score_error_hat), color = "red") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_grid(F_urban_woman ~F_Z)
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
#geom_point(aes(y = score_error), color = "yellow") +
geom_point(aes(y = meaned_score_error_hat), color = "red") +
labs(title="Test scores (dots) vs. dose. Red line is fitted residual vs. dose.") +
facet_grid(F_urban_woman ~F_Z)
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
#geom_point(aes(y = score_error), color = "yellow") +
geom_point(aes(y = score_interaction_hat), color = "red") +
labs(title="Test scores (dots) vs. dose. Red line is fitted score in full interaction model vs. dose.") +
facet_grid(F_urban_woman ~F_Z)
# Pooled-by-assignment view of the uninstrumented fit
ggplot(v_data_frame, aes(x=dose)) +
geom_point(aes(y = score), color = "darkred") +
geom_line(aes(y = score_hat), color="steelblue", linetype="twodash",size=2)+facet_wrap(~F_Z)
# Bootstrapping ----
paste("Now let's do the Wald estimator, with and without bootstrapping.")
## [1] "Now let's do the Wald estimator, with and without bootstrapping."
# Arm-specific means of dose (v1) and score (v2); rows ordered Z = 0 then Z = 1
v1<- aggregate(x = v_data_frame$dose,
by = list(v_data_frame$Z),
FUN = mean)
v2<- aggregate(x = v_data_frame$score,
by = list(v_data_frame$Z),
FUN = mean)
# Wald/IV estimator: ITT effect on score divided by ITT effect on dose
paste("Here's the Wald estimator of the value of tutoring (per dose)",(v2[2,2]-v2[1,2])/(v1[2,2]-v1[1,2]))
## [1] "Here's the Wald estimator of the value of tutoring (per dose) 10.4467522660955"
#
# Bootstrap 95% CI for Wald estimator
#
# Statistic for boot(): computes the Wald/IV ratio on a bootstrap resample.
#   data    -- data frame with columns Z (0/1 assignment), dose, and score
#   indices -- row indices chosen by boot() for this replicate
# Returns the between-arm difference in mean score divided by the
# between-arm difference in mean dose.
Wald_ratio <- function(data, indices) {
  resamp <- data[indices, ]  # boot() supplies the resampled rows
  dose_means  <- aggregate(x = resamp$dose,  by = list(resamp$Z), FUN = mean)
  score_means <- aggregate(x = resamp$score, by = list(resamp$Z), FUN = mean)
  # Row 1 is the Z = 0 arm, row 2 the Z = 1 arm; column 2 holds the means
  (score_means[2, 2] - score_means[1, 2]) /
    (dose_means[2, 2] - dose_means[1, 2])
}
#
# bootstrapping with B replications
#
# boot() is from the boot package (presumably loaded earlier in the document).
# Resamples the full data frame n_bootstrap times and evaluates Wald_ratio.
results <- boot(data=v_data_frame, statistic=Wald_ratio,
R=n_bootstrap)
results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = v_data_frame, statistic = Wald_ratio, R = n_bootstrap)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 10.44675 -0.01261882 0.6101649
paste0("Bootstrap histogram of Wald estimator for TOT treatment effect, qplot to test normality w/",n_bootstrap," replications. Yup-simulated normal data looks normal.")
## [1] "Bootstrap histogram of Wald estimator for TOT treatment effect, qplot to test normality w/5000 replications. Yup-simulated normal data looks normal."
# Default plot method for boot objects: histogram plus normal Q-Q plot
plot(results)
#
# 95% CI and median
#
# Percentile CI built by hand from order statistics of the bootstrap draws.
# NOTE(review): quantile(results$t, c(.025, .5, .975)) would do the same job
# with interpolation; the order-statistic version here is fine for R = 5000.
bs_estimates<-tibble(results$t)
LB<-round(0.025*n_bootstrap)
med<-round(0.50*n_bootstrap)
UB<-round(0.975*n_bootstrap)
# Sort the replicates so row LB/med/UB are the desired order statistics
bs_estimates<-bs_estimates[order(results$t),]
paste0("95% confidence interval of Wald estimate of TOT treatment effect: (", round(bs_estimates[LB,],digits=2)," , ",round(bs_estimates[UB,],digits=2),")")
## [1] "95% confidence interval of Wald estimate of TOT treatment effect: (9.25 , 11.6)"
paste0("Median Wald estimate of TOT treatment effect: ", round(bs_estimates[med,],digits=2)," , Almost identical IVREG coefficient ", round(dose_coef,digits=2))
## [1] "Median Wald estimate of TOT treatment effect: 10.44 , Almost identical IVREG coefficient 10.39"
paste("I was rather surprised by how similar the Wald estimator was to the TOT. The beauty of large-N simulated data that follows the normal distribution.")
## [1] "I was rather surprised by how similar the Wald estimator was to the TOT. The beauty of large-N simulated data that follows the normal distribution."
# Now let's plot some densities ----